{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Anaconda\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:13: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
      "of pandas will change to not sort by default.\n",
      "\n",
      "To accept the future behavior, pass 'sort=False'.\n",
      "\n",
      "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
      "\n",
      "  del sys.path[0]\n"
     ]
    }
   ],
   "source": [
    "# MAE: 528.190\n",
    "# XGBoostRegressor\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import KFold,GridSearchCV\n",
    "from sklearn.metrics import mean_absolute_error\n",
    "from xgboost import XGBRegressor\n",
    "\n",
    "train = pd.read_csv('../data/used_car_train_20200313.csv', sep=' ')\n",
    "test = pd.read_csv('../data/used_car_testB_20200421.csv', sep=' ')\n",
    "\n",
    "# 合并训练数据和测试数据集\n",
    "all_data = pd.concat([train, test], ignore_index=True)\n",
    "\n",
    "# 对 price 做对数变换\n",
    "all_data['price'] = np.log1p(all_data['price'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 处理异常值，如功率大于 600 的值\n",
    "all_data['power'] = all_data['power'].apply(lambda x: 600 if x > 600 else x)\n",
    "\n",
    "# 处理日期相关信息\n",
    "all_data['reg_year'] = all_data['regDate'].apply(lambda x: int(str(x)[:4]))\n",
    "all_data['reg_month'] = all_data['regDate'].apply(lambda x: int(str(x)[4:6]))\n",
    "all_data['reg_day'] = all_data['regDate'].apply(lambda x: int(str(x)[6:]))\n",
    "all_data['creat_year'] = all_data['creatDate'].apply(lambda x: int(str(x)[:4]))\n",
    "all_data['creat_month'] = all_data['creatDate'].apply(lambda x: int(str(x)[4:6]))\n",
    "all_data['creat_day'] = all_data['creatDate'].apply(lambda x: int(str(x)[6:]))\n",
    "\n",
    "# 标记汽车没有经过维修\n",
    "all_data['notRepairedDamage'] = all_data['notRepairedDamage'].apply(lambda x: 0 if x == '-' else 1)\n",
    "\n",
    "# 对可分类的连续特征进行分桶，如将功率（power）分成10个分桶，并提取新特征\n",
    "all_data['power_bucket'] = pd.cut(all_data['power'], 10, labels=False)\n",
    "new_cols = ['power_bucket', 'v_0', 'v_3', 'v_7', 'v_11','v_14'] \n",
    "for col1 in new_cols:\n",
    "    for col2 in new_cols:\n",
    "        if col1 != col2:\n",
    "            all_data['{}_{}_add'.format(col1, col2)] = all_data[col1] + all_data[col2]\n",
    "            all_data['{}_{}_sub'.format(col1, col2)] = all_data[col1] - all_data[col2]\n",
    "\n",
    "# 处理缺失值\n",
    "all_data['fuelType'] = all_data['fuelType'].fillna(0)\n",
    "all_data['gearbox'] = all_data['gearbox'].fillna(0)\n",
    "all_data['bodyType'] = all_data['bodyType'].fillna(0)\n",
    "all_data['model'] = all_data['model'].fillna(0)\n",
    "\n",
    "# 分离特征和标签\n",
    "train_data = all_data[~all_data['price'].isnull()]\n",
    "test_data = all_data[all_data['price'].isnull()]\n",
    "X_train = train_data.drop(['SaleID', 'name', 'regDate', 'creatDate','seller','offerType','power', 'price'], axis=1)\n",
    "X_test = test_data.drop(['SaleID', 'name', 'regDate', 'creatDate','seller','offerType','power','price'], axis=1)\n",
    "y_train = train_data['price']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training on Fold 1\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Anaconda\\Anaconda3\\lib\\site-packages\\xgboost\\sklearn.py:797: UserWarning: `early_stopping_rounds` in `fit` method is deprecated for better compatibility with scikit-learn, use `early_stopping_rounds` in constructor or`set_params` instead.\n",
      "  UserWarning,\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0]\tvalidation_0-mae:7.16525\n",
      "[200]\tvalidation_0-mae:0.12985\n",
      "[400]\tvalidation_0-mae:0.12600\n",
      "[600]\tvalidation_0-mae:0.12414\n",
      "[800]\tvalidation_0-mae:0.12314\n",
      "[999]\tvalidation_0-mae:0.12249\n",
      "Training on Fold 2\n",
      "[0]\tvalidation_0-mae:7.16276\n",
      "[200]\tvalidation_0-mae:0.13202\n",
      "[400]\tvalidation_0-mae:0.12791\n",
      "[600]\tvalidation_0-mae:0.12604\n",
      "[800]\tvalidation_0-mae:0.12503\n",
      "[999]\tvalidation_0-mae:0.12427\n",
      "Training on Fold 3\n",
      "[0]\tvalidation_0-mae:7.13761\n",
      "[200]\tvalidation_0-mae:0.13119\n",
      "[400]\tvalidation_0-mae:0.12743\n",
      "[600]\tvalidation_0-mae:0.12559\n",
      "[800]\tvalidation_0-mae:0.12469\n",
      "[999]\tvalidation_0-mae:0.12395\n",
      "Training on Fold 4\n",
      "[0]\tvalidation_0-mae:7.15918\n",
      "[200]\tvalidation_0-mae:0.12995\n",
      "[400]\tvalidation_0-mae:0.12608\n",
      "[600]\tvalidation_0-mae:0.12437\n",
      "[800]\tvalidation_0-mae:0.12314\n",
      "[999]\tvalidation_0-mae:0.12241\n",
      "Training on Fold 5\n",
      "[0]\tvalidation_0-mae:7.16851\n",
      "[200]\tvalidation_0-mae:0.12914\n",
      "[400]\tvalidation_0-mae:0.12546\n",
      "[600]\tvalidation_0-mae:0.12382\n",
      "[800]\tvalidation_0-mae:0.12275\n",
      "[999]\tvalidation_0-mae:0.12213\n",
      "MAE: 526.937\n"
     ]
    }
   ],
   "source": [
    "# 定义模型参数\n",
    "xgb_model = XGBRegressor(\n",
    "    max_depth=10,\n",
    "    learning_rate=0.05,\n",
    "    n_estimators=1000,\n",
    "    gamma=0.005,\n",
    "    subsample=0.7,\n",
    "    colsample_bytree=0.6,\n",
    "    objective='reg:squarederror',\n",
    "    n_jobs=-1,\n",
    "    random_state=17,\n",
    "    eval_metric='mae'\n",
    ")\n",
    "\n",
    "# 交叉验证以及训练模型\n",
    "skf = KFold(n_splits=5, shuffle=True, random_state=17)\n",
    "oof = np.zeros(len(X_train))\n",
    "test_predict = np.zeros(len(X_test))\n",
    "for i, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):\n",
    "    print(\"Training on Fold {}\".format(i+1))\n",
    "    tr_x, tr_y = X_train.iloc[train_index], y_train.iloc[train_index]\n",
    "    vl_x, vl_y = X_train.iloc[val_index], y_train.iloc[val_index]\n",
    "    xgb_model.fit(\n",
    "        tr_x, tr_y,\n",
    "        eval_set=[(vl_x, vl_y)],\n",
    "        early_stopping_rounds=100,\n",
    "        verbose=200\n",
    "    )\n",
    "\n",
    "    oof[val_index] = xgb_model.predict(vl_x)\n",
    "    test_predict += xgb_model.predict(X_test) / skf.n_splits\n",
    "\n",
    "mae = mean_absolute_error(np.expm1(y_train), np.expm1(oof))\n",
    "print(\"MAE: {:.3f}\".format(mae))\n",
    "\n",
    "# 保存预测结果\n",
    "res = pd.DataFrame({'SaleID': test_data['SaleID'], 'price': np.expm1(test_predict)})\n",
    "res.to_csv('xgb_submission.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 定义网格搜索的参数范围\n",
    "param_grid = {\n",
    "    'learning_rate': [0.03, 0.05],\n",
    "    'n_estimators': [500, 1000]\n",
    "}\n",
    "\n",
    "# 创建XGBRegressor模型实例（初始参数随意给定，后续通过网格搜索确定最优参数）\n",
    "xgb_model = XGBRegressor(\n",
    "    max_depth=10,\n",
    "    n_estimators=1000,\n",
    "    objective='reg:squarederror',\n",
    "    n_jobs=-1,\n",
    "    subsample=0.7,\n",
    "    gamma=0.005,\n",
    "    colsample_bytree=0.6,\n",
    "    random_state=17,\n",
    "    eval_metric='mae'\n",
    ")\n",
    "\n",
    "\n",
    "# 交叉验证以及进行网格搜索调参\n",
    "skf = KFold(n_splits=5, shuffle=True, random_state=17)\n",
    "best_mae = float('inf')\n",
    "best_model = None\n",
    "\n",
    "for train_index, val_index in skf.split(X_train, y_train):\n",
    "    tr_x, tr_y = X_train.iloc[train_index], y_train.iloc[train_index]\n",
    "    vl_x, vl_y = X_train.iloc[val_index], y_train.iloc[val_index]\n",
    "\n",
    "    grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='neg_mean_absolute_error')\n",
    "    grid_search.fit(tr_x, tr_y)\n",
    "    # 获取当前折叠下最优模型在验证集上的预测结果\n",
    "    current_model = grid_search.best_estimator_\n",
    "    val_pred = current_model.predict(vl_x)\n",
    "    current_mae = mean_absolute_error(np.expm1(vl_y), np.expm1(val_pred))\n",
    "\n",
    "    # 更新最优MAE对应的模型\n",
    "    if current_mae < best_mae:\n",
    "        best_mae = current_mae\n",
    "        best_model = current_model\n",
    "\n",
    "# 使用找到的最优模型在整个训练集上重新训练\n",
    "best_model.fit(X_train, y_train)\n",
    "\n",
    "# 使用最优模型对测试集进行预测\n",
    "test_predict = best_model.predict(X_test)\n",
    "\n",
    "# 保存预测结果\n",
    "# submission = pd.DataFrame({'SaleID': test_data['SaleID'], 'price': np.expm1(test_predict)})\n",
    "# submission.to_csv('xgb_submission.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 计算并输出最终的MAE值\n",
    "mae = mean_absolute_error(np.expm1(y_train), np.expm1(best_model.predict(X_train)))\n",
    "print(\"最终MAE: {:.3f}\".format(mae))\n",
    "# 输出最优模型的参数\n",
    "print(\"最优模型参数：\", best_model.get_params())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-15-f3430333db0c>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m     19\u001b[0m \u001b[1;31m# 第一步：直接在整个训练集上进行网格搜索，获取局部最优参数\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     20\u001b[0m \u001b[0mgrid_search\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mGridSearchCV\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mxgb_model\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparam_grid\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcv\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m3\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mscoring\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'neg_mean_absolute_error'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 21\u001b[1;33m \u001b[0mgrid_search\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     22\u001b[0m \u001b[0mbest_model\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mgrid_search\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbest_estimator_\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     23\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_search.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y, groups, **fit_params)\u001b[0m\n\u001b[0;32m    685\u001b[0m                 \u001b[1;32mreturn\u001b[0m \u001b[0mresults\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    686\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 687\u001b[1;33m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_run_search\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mevaluate_candidates\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    688\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    689\u001b[0m         \u001b[1;31m# For multi-metric evaluation, store the best_index_, best_params_ and\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_search.py\u001b[0m in \u001b[0;36m_run_search\u001b[1;34m(self, evaluate_candidates)\u001b[0m\n\u001b[0;32m   1146\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m_run_search\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mevaluate_candidates\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1147\u001b[0m         \u001b[1;34m\"\"\"Search all candidates in param_grid\"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1148\u001b[1;33m         \u001b[0mevaluate_candidates\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mParameterGrid\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparam_grid\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1149\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1150\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_search.py\u001b[0m in \u001b[0;36mevaluate_candidates\u001b[1;34m(candidate_params)\u001b[0m\n\u001b[0;32m    664\u001b[0m                                \u001b[1;32mfor\u001b[0m \u001b[0mparameters\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mtrain\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtest\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    665\u001b[0m                                in product(candidate_params,\n\u001b[1;32m--> 666\u001b[1;33m                                           cv.split(X, y, groups)))\n\u001b[0m\u001b[0;32m    667\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    668\u001b[0m                 \u001b[1;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mout\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m<\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\Anaconda3\\lib\\site-packages\\joblib\\parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m    922\u001b[0m                 \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_iterating\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_original_iterator\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    923\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 924\u001b[1;33m             \u001b[1;32mwhile\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdispatch_one_batch\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0miterator\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    925\u001b[0m                 \u001b[1;32mpass\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    926\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\Anaconda3\\lib\\site-packages\\joblib\\parallel.py\u001b[0m in \u001b[0;36mdispatch_one_batch\u001b[1;34m(self, iterator)\u001b[0m\n\u001b[0;32m    757\u001b[0m                 \u001b[1;32mreturn\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    758\u001b[0m             \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 759\u001b[1;33m                 \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_dispatch\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtasks\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    760\u001b[0m                 \u001b[1;32mreturn\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    761\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\Anaconda3\\lib\\site-packages\\joblib\\parallel.py\u001b[0m in \u001b[0;36m_dispatch\u001b[1;34m(self, batch)\u001b[0m\n\u001b[0;32m    714\u001b[0m         \u001b[1;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_lock\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    715\u001b[0m             \u001b[0mjob_idx\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jobs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 716\u001b[1;33m             \u001b[0mjob\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mapply_async\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbatch\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    717\u001b[0m             \u001b[1;31m# A job can complete so quickly than its callback is\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    718\u001b[0m             \u001b[1;31m# called before we get here, causing self._jobs to\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\Anaconda3\\lib\\site-packages\\joblib\\_parallel_backends.py\u001b[0m in \u001b[0;36mapply_async\u001b[1;34m(self, func, callback)\u001b[0m\n\u001b[0;32m    180\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mapply_async\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    181\u001b[0m         \u001b[1;34m\"\"\"Schedule a func to be run\"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 182\u001b[1;33m         \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mImmediateResult\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    183\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mcallback\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    184\u001b[0m             \u001b[0mcallback\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\Anaconda3\\lib\\site-packages\\joblib\\_parallel_backends.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, batch)\u001b[0m\n\u001b[0;32m    547\u001b[0m         \u001b[1;31m# Don't delay the application, to avoid keeping the input\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    548\u001b[0m         \u001b[1;31m# arguments in memory\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 549\u001b[1;33m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mresults\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mbatch\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    550\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    551\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\Anaconda3\\lib\\site-packages\\joblib\\parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    223\u001b[0m         \u001b[1;32mwith\u001b[0m \u001b[0mparallel_backend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_n_jobs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    224\u001b[0m             return [func(*args, **kwargs)\n\u001b[1;32m--> 225\u001b[1;33m                     for func, args, kwargs in self.items]\n\u001b[0m\u001b[0;32m    226\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    227\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m__len__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\Anaconda3\\lib\\site-packages\\joblib\\parallel.py\u001b[0m in \u001b[0;36m<listcomp>\u001b[1;34m(.0)\u001b[0m\n\u001b[0;32m    223\u001b[0m         \u001b[1;32mwith\u001b[0m \u001b[0mparallel_backend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_n_jobs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    224\u001b[0m             return [func(*args, **kwargs)\n\u001b[1;32m--> 225\u001b[1;33m                     for func, args, kwargs in self.items]\n\u001b[0m\u001b[0;32m    226\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    227\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m__len__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py\u001b[0m in \u001b[0;36m_fit_and_score\u001b[1;34m(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)\u001b[0m\n\u001b[0;32m    512\u001b[0m             \u001b[0mestimator\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    513\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 514\u001b[1;33m             \u001b[0mestimator\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    515\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    516\u001b[0m     \u001b[1;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\Anaconda3\\lib\\site-packages\\xgboost\\core.py\u001b[0m in \u001b[0;36minner_f\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m    573\u001b[0m         \u001b[1;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0marg\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msig\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparameters\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    574\u001b[0m             \u001b[0mkwargs\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mk\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0marg\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 575\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    576\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    577\u001b[0m     \u001b[1;32mreturn\u001b[0m \u001b[0minner_f\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\Anaconda3\\lib\\site-packages\\xgboost\\sklearn.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y, sample_weight, base_margin, eval_set, eval_metric, early_stopping_rounds, verbose, xgb_model, sample_weight_eval_set, base_margin_eval_set, feature_weights, callbacks)\u001b[0m\n\u001b[0;32m    970\u001b[0m             \u001b[0mverbose_eval\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mverbose\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    971\u001b[0m             \u001b[0mxgb_model\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmodel\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 972\u001b[1;33m             \u001b[0mcallbacks\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcallbacks\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    973\u001b[0m         )\n\u001b[0;32m    974\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\Anaconda3\\lib\\site-packages\\xgboost\\core.py\u001b[0m in \u001b[0;36minner_f\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m    573\u001b[0m         \u001b[1;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0marg\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msig\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparameters\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    574\u001b[0m             \u001b[0mkwargs\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mk\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0marg\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 575\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    576\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    577\u001b[0m     \u001b[1;32mreturn\u001b[0m \u001b[0minner_f\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\Anaconda3\\lib\\site-packages\\xgboost\\training.py\u001b[0m in \u001b[0;36mtrain\u001b[1;34m(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks, custom_metric)\u001b[0m\n\u001b[0;32m    179\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mcb_container\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbefore_iteration\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbst\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mi\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtrain\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mevals\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    180\u001b[0m             \u001b[1;32mbreak\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 181\u001b[1;33m         \u001b[0mbst\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdtrain\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mi\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mobj\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    182\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mcb_container\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mafter_iteration\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbst\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mi\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtrain\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mevals\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    183\u001b[0m             \u001b[1;32mbreak\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\Anaconda3\\lib\\site-packages\\xgboost\\core.py\u001b[0m in \u001b[0;36mupdate\u001b[1;34m(self, dtrain, iteration, fobj)\u001b[0m\n\u001b[0;32m   1778\u001b[0m             _check_call(_LIB.XGBoosterUpdateOneIter(self.handle,\n\u001b[0;32m   1779\u001b[0m                                                     \u001b[0mctypes\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mc_int\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0miteration\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1780\u001b[1;33m                                                     dtrain.handle))\n\u001b[0m\u001b[0;32m   1781\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1782\u001b[0m             \u001b[0mpred\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdtrain\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0moutput_margin\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtraining\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "param_grid = {\n",
    "    'max_depth':[7,10,13],\n",
    "    'learning_rate': [0.03, 0.05],\n",
    "    'n_estimators': [500, 1000]\n",
    "}\n",
    "\n",
    "# 创建XGBRegressor模型实例（初始参数随意给定，后续通过网格搜索确定最优参数）\n",
    "xgb_model = XGBRegressor(\n",
    "    n_estimators=1000,\n",
    "    objective='reg:squarederror',\n",
    "    n_jobs=-1,\n",
    "    subsample=0.7,\n",
    "    gamma=0.005,\n",
    "    colsample_bytree=0.6,\n",
    "    random_state=17,\n",
    "    eval_metric='mae'\n",
    ")\n",
    "\n",
    "# 第一步：直接在整个训练集上进行网格搜索，获取局部最优参数\n",
    "grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='neg_mean_absolute_error')\n",
    "grid_search.fit(X_train, y_train)\n",
    "best_model = grid_search.best_estimator_\n",
    "\n",
    "# 第二步：使用得到的最优模型进行五折交叉验证，评估模型泛化能力\n",
    "skf = KFold(n_splits=5, shuffle=True, random_state=17)\n",
    "validation_scores = []\n",
    "for train_index, val_index in skf.split(X_train, y_train):\n",
    "    tr_x, tr_y = X_train.iloc[train_index], y_train.iloc[train_index]\n",
    "    vl_x, vl_y = X_train.iloc[val_index], y_train.iloc[val_index]\n",
    "\n",
    "    val_pred = best_model.predict(vl_x)\n",
    "    mae = mean_absolute_error(np.expm1(vl_y), np.expm1(val_pred))\n",
    "    validation_scores.append(mae)\n",
    "\n",
    "print(\"五折交叉验证平均 MAE:\", np.mean(validation_scores))\n",
    "\n",
    "# 第三步：使用最优模型对测试集进行预测\n",
    "test_predict = best_model.predict(X_test)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "skf = KFold(n_splits=5, shuffle=True, random_state=17)\n",
    "validation_scores = []\n",
    "for train_index, val_index in skf.split(X_train, y_train):\n",
    "    tr_x, tr_y = X_train.iloc[train_index], y_train.iloc[train_index]\n",
    "    vl_x, vl_y = X_train.iloc[val_index], y_train.iloc[val_index]\n",
    "\n",
    "    val_pred = best_model.predict(vl_x)\n",
    "    mae = mean_absolute_error(np.expm1(vl_y), np.expm1(val_pred))\n",
    "    validation_scores.append(mae)\n",
    "\n",
    "print(\"五折交叉验证平均 MAE:\", np.mean(validation_scores))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 保存预测结果，并输出提示信息\n",
    "submission = pd.DataFrame({'SaleID': test_data['SaleID'], 'price': np.expm1(test_predict)})\n",
    "submission.to_csv('xgb_submission.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
